Множественная проверка гипотез


In [2]:
import pandas as pd
import numpy as np

from scipy.stats import pearsonr
from statsmodels.sandbox.stats.multicomp import multipletests

Foodmart product sales


In [3]:
sales = pd.read_csv('foodmart.sales.tsv', sep = '\t', header = 0, parse_dates = [2])

In [4]:
sales.head()


Out[4]:
product_id store_id date sales
0 4 6 1997-01-01 4
1 25 6 1997-01-01 3
2 48 6 1997-01-01 3
3 76 6 1997-01-01 4
4 119 6 1997-01-01 3

In [5]:
products = pd.read_csv('foodmart.products.tsv', sep = '\t', header = 0)

In [6]:
products.head()


Out[6]:
product_class_id product_id brand_name product_name SKU SRP gross_weight net_weight recyclable_package low_fat units_per_case cases_per_pallet shelf_width shelf_height shelf_depth
0 30 1 Washington Washington Berry Juice 90748583674 2.85 8.39 6.39 False False 30 14 16.9 12.60 7.40
1 52 2 Washington Washington Mango Drink 96516502499 0.74 7.42 4.42 False True 18 8 13.4 3.71 22.60
2 52 3 Washington Washington Strawberry Drink 58427771925 0.83 13.10 11.10 True True 17 13 14.4 11.00 7.77
3 19 4 Washington Washington Cream Soda 64412155747 3.64 10.60 9.60 True False 26 10 22.9 18.90 7.93
4 19 5 Washington Washington Diet Soda 85561191439 2.19 6.66 4.65 True False 7 10 20.7 21.90 19.20

In [7]:
sales = sales.merge(products[['product_id', 'product_name']], 
                    on = ['product_id'], how = 'inner')

In [8]:
sales.head()


Out[8]:
product_id store_id date sales product_name
0 4 6 1997-01-01 4 Washington Cream Soda
1 4 7 1997-01-05 3 Washington Cream Soda
2 4 6 1997-01-06 2 Washington Cream Soda
3 4 17 1997-01-11 2 Washington Cream Soda
4 4 24 1997-01-11 2 Washington Cream Soda

Корреляция Пирсона


In [9]:
sparse_sales = pd.pivot_table(sales, values='sales', index=['date', 'store_id'],
                     columns=['product_name'], fill_value = 0)

In [10]:
sparse_sales.head()


Out[10]:
product_name ADJ Rosy Sunglasses Akron City Map Akron Eyeglass Screwdriver American Beef Bologna American Chicken Hot Dogs American Cole Slaw American Corned Beef American Foot-Long Hot Dogs American Low Fat Bologna American Low Fat Cole Slaw ... Washington Apple Juice Washington Berry Juice Washington Cola Washington Cranberry Juice Washington Cream Soda Washington Diet Cola Washington Diet Soda Washington Mango Drink Washington Orange Juice Washington Strawberry Drink
date store_id
1997-01-01 6 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 4 0 0 0 0 0
14 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1997-01-02 11 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1997-01-03 7 0 0 0 0 0 0 0 0 0 0 ... 0 4 0 0 0 0 0 0 0 0

5 rows × 1559 columns


In [11]:
%%time 
corr_data = []

for i, lhs_column in enumerate(sparse_sales.columns):
    for j, rhs_column in enumerate(sparse_sales.columns):
        if i >= j:
            continue
        
        corr, p = pearsonr(sparse_sales[lhs_column], sparse_sales[rhs_column])
        corr_data.append([lhs_column, rhs_column, corr, p])


Wall time: 1min 33s

In [12]:
sales_correlation = pd.DataFrame.from_records(corr_data)
sales_correlation.columns = ['product_A', 'product_B', 'corr', 'p']

In [13]:
sales_correlation.head()


Out[13]:
product_A product_B corr p
0 ADJ Rosy Sunglasses Akron City Map 0.076608 0.032414
1 ADJ Rosy Sunglasses Akron Eyeglass Screwdriver -0.006581 0.854396
2 ADJ Rosy Sunglasses American Beef Bologna 0.038685 0.280546
3 ADJ Rosy Sunglasses American Chicken Hot Dogs 0.041105 0.251529
4 ADJ Rosy Sunglasses American Cole Slaw -0.045887 0.200484

Сколько гипотез об отсутствии корреляции отвергается без поправки на множественную проверку?


In [14]:
(sales_correlation.p < 0.05).value_counts()


Out[14]:
False    982453
True     232008
Name: p, dtype: int64

Поправка на множественную проверку

Метод Холма


In [15]:
reject, p_corrected, a1, a2 = multipletests(sales_correlation.p, 
                                            alpha = 0.05, 
                                            method = 'holm')

In [16]:
sales_correlation['p_corrected'] = p_corrected
sales_correlation['reject'] = reject

In [17]:
sales_correlation.head()


Out[17]:
product_A product_B corr p p_corrected reject
0 ADJ Rosy Sunglasses Akron City Map 0.076608 0.032414 1 False
1 ADJ Rosy Sunglasses Akron Eyeglass Screwdriver -0.006581 0.854396 1 False
2 ADJ Rosy Sunglasses American Beef Bologna 0.038685 0.280546 1 False
3 ADJ Rosy Sunglasses American Chicken Hot Dogs 0.041105 0.251529 1 False
4 ADJ Rosy Sunglasses American Cole Slaw -0.045887 0.200484 1 False

In [18]:
sales_correlation.reject.value_counts()


Out[18]:
False    1212733
True        1728
Name: reject, dtype: int64

In [19]:
sales_correlation[sales_correlation.reject == True].sort_values(by='corr', ascending=False)


Out[19]:
product_A product_B corr p p_corrected reject
1063670 Just Right Vegetable Soup Plato French Roast Coffee 0.340598 1.226033e-22 1.488970e-16 True
885574 Great Muffins Nationeel Grape Fruit Roll 0.322176 2.688803e-20 3.265443e-14 True
473067 Club Low Fat Cottage Cheese Skinner Strawberry Drink 0.306701 1.883995e-18 2.288034e-12 True
1181001 Robust Monthly Home Magazine Tri-State Lemons 0.303269 4.674973e-18 5.677558e-12 True
1160248 Pleasant Regular Ramen Soup Shady Lake Ravioli 0.298502 1.619119e-17 1.966350e-11 True
885261 Great Muffins High Quality Copper Cleaner 0.297160 2.287551e-17 2.778130e-11 True
1151585 Plato Salt Tri-State Onions 0.291224 1.032464e-16 1.253881e-10 True
1092548 Mighty Good Monthly Home Magazine Tell Tale Baby Onion 0.290458 1.250831e-16 1.519077e-10 True
678154 Even Better Buttermilk Sunset Toilet Paper 0.290046 1.386446e-16 1.683774e-10 True
914945 Hermanos Macintosh Apples Plato French Roast Coffee 0.289095 1.757608e-16 2.134531e-10 True
296176 Booker Blueberry Yogurt Fast Strawberry Fruit Roll 0.288495 2.040104e-16 2.477606e-10 True
1195806 Sunset Glass Cleaner Tell Tale Green Pepper 0.287523 2.595168e-16 3.151701e-10 True
778504 Faux Products Mint Mouthwash Skinner Cream Soda 0.285697 4.069434e-16 4.942121e-10 True
45043 BBB Best Apple Butter Carrington Frozen Carrots 0.284404 5.584481e-16 6.782062e-10 True
490841 Colony Pumpernickel Bread Imagine Frozen Broccoli 0.284278 5.759605e-16 6.994735e-10 True
922125 Hermanos Plums Tell Tale Summer Squash 0.284008 6.151860e-16 7.471101e-10 True
812401 Fort West Raisins Sunset AA-Size Batteries 0.281877 1.031856e-15 1.253133e-09 True
834549 Golden Blueberry Waffles Super Regular Coffee 0.280110 1.579078e-15 1.917702e-09 True
87152 BBB Best Vegetable Oil Tri-State Elephant Garlic 0.279151 1.986864e-15 2.412933e-09 True
623896 Dollar Monthly Home Magazine Portsmouth Chardonnay Wine 0.278492 2.325036e-15 2.823621e-09 True
98754 Best Choice Cheese Dip Nationeel Fudge Brownies 0.276663 3.589792e-15 4.359591e-09 True
1132103 PigTail Beef TV Dinner Tell Tale Asparagus 0.273964 6.774349e-15 8.227040e-09 True
1148273 Plato French Roast Coffee Tell Tale Macintosh Apples 0.273905 6.869042e-15 8.342032e-09 True
423539 Carrington Frozen Chicken Breast Red Wing Screw Driver 0.273705 7.196552e-15 8.739766e-09 True
364300 CDR Extra Chunky Peanut Butter Imagine Frozen Sausage Pizza 0.273332 7.851595e-15 9.535267e-09 True
60735 BBB Best Decaf Coffee Gulf Coast Bubble Gum 0.273142 8.207575e-15 9.967575e-09 True
353311 CDR Apple Preserves Cutting Edge Potato Salad 0.273096 8.296192e-15 1.007519e-08 True
921798 Hermanos Plums Nationeel Graham Crackers 0.272966 8.552863e-15 1.038689e-08 True
777975 Faux Products Mint Mouthwash Hermanos Macintosh Apples 0.272103 1.045324e-14 1.269477e-08 True
641278 Ebony Firm Tofu Sunset Counter Cleaner 0.270295 1.588138e-14 1.928686e-08 True
... ... ... ... ... ... ...
513582 Consolidated Extra Moisture Shampoo Lake Corned Beef 0.195186 3.896997e-08 4.726133e-02 True
436952 Carrington Ice Cream Horatio Raspberry Fruit Roll 0.195180 3.900854e-08 4.730807e-02 True
847133 Golden Frozen Sausage Pizza Landslide Apple Butter 0.195165 3.910777e-08 4.742837e-02 True
933970 High Quality 75 Watt Lightbulb Plato French Roast Coffee 0.195161 3.913360e-08 4.745966e-02 True
44287 Atomic White Chocolate Bar Modell Muffins 0.195138 3.928076e-08 4.763809e-02 True
505488 Consolidated Apricot Shampoo Pleasant Turkey Noodle Soup 0.195116 3.941970e-08 4.780656e-02 True
790723 Fort West Cheese Crackers Top Measure Chardonnay 0.195099 3.952766e-08 4.793744e-02 True
733912 Fast Beef Jerky Good Light Wine 0.195064 3.975083e-08 4.820805e-02 True
1159301 Pleasant Fancy Canned Sardines Steady Mint Mouthwash 0.195059 3.978462e-08 4.824900e-02 True
927407 Hermanos Summer Squash Super Strawberry Preserves 0.195031 3.996668e-08 4.846975e-02 True
815542 Fort West Sesame Crackers Horatio Fudge Cookies 0.194993 4.021331e-08 4.876881e-02 True
725774 Fantastic English Muffins Tell Tale Mandarin Oranges 0.194990 4.023652e-08 4.879692e-02 True
771540 Faux Products Childrens Cold Remedy Horatio Chocolate Chip Cookies 0.194986 4.025788e-08 4.882279e-02 True
509641 Consolidated Conditioning Shampoo Footnote Extra Lean Hamburger 0.194986 4.026006e-08 4.882538e-02 True
532932 Cormorant C-Size Batteries Washington Strawberry Drink 0.194982 4.028907e-08 4.886053e-02 True
511031 Consolidated Deodorant High Quality Soft Napkins 0.194981 4.029434e-08 4.886688e-02 True
1097081 Modell Wheat Bread Washington Apple Juice 0.194977 4.031823e-08 4.889581e-02 True
1077587 Landslide Corn Oil Plato Chunky Peanut Butter 0.194963 4.041312e-08 4.901084e-02 True
338688 Bravo Fancy Canned Oysters Carlson String Cheese 0.194958 4.044278e-08 4.904678e-02 True
1125069 Nationeel Potato Chips Tell Tale Canned Peanuts 0.194947 4.051404e-08 4.913316e-02 True
1031716 Imagine Frozen Pepperoni Pizza Tri-State Prepared Salad 0.194947 4.051638e-08 4.913596e-02 True
35031 Atomic Malted Milk Balls Just Right Vegetable Soup 0.194939 4.056627e-08 4.919642e-02 True
3896 Akron Eyeglass Screwdriver Hermanos Macintosh Apples 0.194929 4.063596e-08 4.928090e-02 True
1101432 Moms Potato Salad National Small Brown Eggs 0.194920 4.069757e-08 4.935558e-02 True
209481 Big Time Grape Popsicles Pleasant Vegetable Soup 0.194915 4.072982e-08 4.939464e-02 True
662127 Ebony Potatos Just Right Fancy Canned Anchovies 0.194914 4.073740e-08 4.940380e-02 True
565740 Cutting Edge Cole Slaw Horatio Potato Chips 0.194875 4.099170e-08 4.971215e-02 True
615865 Discover Ravioli High Quality Soft Napkins 0.194869 4.103075e-08 4.975947e-02 True
559150 Cormorant Toilet Bowl Cleaner Plato Chunky Peanut Butter 0.194863 4.107241e-08 4.980995e-02 True
149050 Better Beef Soup Pleasant Canned Yams 0.194856 4.112254e-08 4.987070e-02 True

1728 rows × 6 columns

Метод Бенджамини-Хохберга


In [20]:
reject, p_corrected, a1, a2 = multipletests(sales_correlation.p, 
                                            alpha = 0.05, 
                                            method = 'fdr_bh')

In [21]:
sales_correlation['p_corrected'] = p_corrected
sales_correlation['reject'] = reject

In [22]:
sales_correlation.head()


Out[22]:
product_A product_B corr p p_corrected reject
0 ADJ Rosy Sunglasses Akron City Map 0.076608 0.032414 0.203716 False
1 ADJ Rosy Sunglasses Akron Eyeglass Screwdriver -0.006581 0.854396 0.956078 False
2 ADJ Rosy Sunglasses American Beef Bologna 0.038685 0.280546 0.630699 False
3 ADJ Rosy Sunglasses American Chicken Hot Dogs 0.041105 0.251529 0.600790 False
4 ADJ Rosy Sunglasses American Cole Slaw -0.045887 0.200484 0.541916 False

In [23]:
sales_correlation.reject.value_counts()


Out[23]:
False    1138407
True       76054
Name: reject, dtype: int64

In [24]:
sales_correlation[sales_correlation.reject == True].sort_values(by='corr', ascending=False)


Out[24]:
product_A product_B corr p p_corrected reject
1063670 Just Right Vegetable Soup Plato French Roast Coffee 0.340598 1.226033e-22 1.488970e-16 True
885574 Great Muffins Nationeel Grape Fruit Roll 0.322176 2.688803e-20 1.632723e-14 True
473067 Club Low Fat Cottage Cheese Skinner Strawberry Drink 0.306701 1.883995e-18 7.626793e-13 True
1181001 Robust Monthly Home Magazine Tri-State Lemons 0.303269 4.674973e-18 1.419393e-12 True
1160248 Pleasant Regular Ramen Soup Shady Lake Ravioli 0.298502 1.619119e-17 3.932713e-12 True
885261 Great Muffins High Quality Copper Cleaner 0.297160 2.287551e-17 4.630236e-12 True
1151585 Plato Salt Tri-State Onions 0.291224 1.032464e-16 1.791267e-11 True
1092548 Mighty Good Monthly Home Magazine Tell Tale Baby Onion 0.290458 1.250831e-16 1.870872e-11 True
678154 Even Better Buttermilk Sunset Toilet Paper 0.290046 1.386446e-16 1.870872e-11 True
914945 Hermanos Macintosh Apples Plato French Roast Coffee 0.289095 1.757608e-16 2.134546e-11 True
296176 Booker Blueberry Yogurt Fast Strawberry Fruit Roll 0.288495 2.040104e-16 2.252388e-11 True
1195806 Sunset Glass Cleaner Tell Tale Green Pepper 0.287523 2.595168e-16 2.626442e-11 True
778504 Faux Products Mint Mouthwash Skinner Cream Soda 0.285697 4.069434e-16 3.801669e-11 True
45043 BBB Best Apple Butter Carrington Frozen Carrots 0.284404 5.584481e-16 4.663210e-11 True
490841 Colony Pumpernickel Bread Imagine Frozen Broccoli 0.284278 5.759605e-16 4.663210e-11 True
922125 Hermanos Plums Tell Tale Summer Squash 0.284008 6.151860e-16 4.669496e-11 True
812401 Fort West Raisins Sunset AA-Size Batteries 0.281877 1.031856e-15 7.371465e-11 True
834549 Golden Blueberry Waffles Super Regular Coffee 0.280110 1.579078e-15 1.065405e-10 True
87152 BBB Best Vegetable Oil Tri-State Elephant Garlic 0.279151 1.986864e-15 1.269983e-10 True
623896 Dollar Monthly Home Magazine Portsmouth Chardonnay Wine 0.278492 2.325036e-15 1.411833e-10 True
98754 Best Choice Cheese Dip Nationeel Fudge Brownies 0.276663 3.589792e-15 2.076030e-10 True
1132103 PigTail Beef TV Dinner Tell Tale Asparagus 0.273964 6.774349e-15 3.627036e-10 True
1148273 Plato French Roast Coffee Tell Tale Macintosh Apples 0.273905 6.869042e-15 3.627036e-10 True
423539 Carrington Frozen Chicken Breast Red Wing Screw Driver 0.273705 7.196552e-15 3.641638e-10 True
364300 CDR Extra Chunky Peanut Butter Imagine Frozen Sausage Pizza 0.273332 7.851595e-15 3.709685e-10 True
60735 BBB Best Decaf Coffee Gulf Coast Bubble Gum 0.273142 8.207575e-15 3.709685e-10 True
353311 CDR Apple Preserves Cutting Edge Potato Salad 0.273096 8.296192e-15 3.709685e-10 True
921798 Hermanos Plums Nationeel Graham Crackers 0.272966 8.552863e-15 3.709685e-10 True
777975 Faux Products Mint Mouthwash Hermanos Macintosh Apples 0.272103 1.045324e-14 4.377606e-10 True
641278 Ebony Firm Tofu Sunset Counter Cleaner 0.270295 1.588138e-14 6.429108e-10 True
... ... ... ... ... ... ...
503962 Consolidated Angled Toothbrush Hilltop Multi-Symptom Cold Remedy 0.105676 3.127646e-03 4.996220e-02 True
685978 Even Better Low Fat Sour Cream Imagine Frozen Sausage Pizza 0.105676 3.127694e-03 4.996220e-02 True
189763 Big Time Fajita French Fries Tell Tale Tomatos 0.105675 3.127756e-03 4.996220e-02 True
1083389 Landslide Pepper Shady Lake Rice Medly 0.105675 3.127760e-03 4.996220e-02 True
899101 Hermanos Baby Onion Lake Pimento Loaf 0.105675 3.127797e-03 4.996220e-02 True
1210319 Tip Top Lox Tri-State Limes 0.105675 3.127831e-03 4.996220e-02 True
819313 Framton Eyeglass Screwdriver Nationeel Grape Fruit Roll 0.105675 3.127870e-03 4.996220e-02 True
272090 Blue Label Creamed Corn Modell Bagels 0.105675 3.127971e-03 4.996315e-02 True
1013449 Horatio Low Fat Chips Plato Tomato Sauce 0.105674 3.128012e-03 4.996315e-02 True
452542 Choice Mint Chocolate Bar Cormorant Paper Towels 0.105674 3.128079e-03 4.996333e-02 True
1073971 Landslide Apple Jam Red Spade Foot-Long Hot Dogs 0.105674 3.128105e-03 4.996333e-02 True
1202407 Super Regular Coffee Tell Tale Mushrooms 0.105673 3.128419e-03 4.996769e-02 True
319623 Booker Whole Milk PigTail Lime Popsicles 0.105672 3.128675e-03 4.997112e-02 True
863249 Good Light Wine Landslide Strawberry Jam 0.105671 3.129087e-03 4.997703e-02 True
832433 Golden Apple Cinnamon Waffles Landslide Canola Oil 0.105670 3.129285e-03 4.997955e-02 True
184296 Big Time Beef TV Dinner Carlson String Cheese 0.105669 3.129490e-03 4.998204e-02 True
205949 Big Time Frozen Pepperoni Pizza Even Better String Cheese 0.105669 3.129564e-03 4.998204e-02 True
244890 Bird Call Laundry Detergent Tri-State Mushrooms 0.105669 3.129583e-03 4.998204e-02 True
703436 Excellent Cola Monarch Manicotti 0.105669 3.129606e-03 4.998204e-02 True
254265 Black Tie City Map PigTail Frozen Sausage Pizza 0.105667 3.130062e-03 4.998866e-02 True
997510 Hilltop Toothpaste Tell Tale Broccoli 0.105667 3.130214e-03 4.999043e-02 True
463130 Club Blueberry Yogurt PigTail Blueberry Waffles 0.105666 3.130379e-03 4.999242e-02 True
1025556 Imagine Frozen Carrots Thresher Bubble Gum 0.105666 3.130446e-03 4.999283e-02 True
7211 American Chicken Hot Dogs Just Right Canned Peas 0.105665 3.130594e-03 4.999454e-02 True
282728 Blue Label Rice Soup Gulf Coast Mint Chocolate Bar 0.105665 3.130693e-03 4.999516e-02 True
108018 Best Choice Dried Dates Tell Tale Party Nuts 0.105665 3.130716e-03 4.999516e-02 True
419799 Carrington Frozen Carrots Sphinx Muffins 0.105664 3.130854e-03 4.999649e-02 True
1184578 Skinner Diet Cola Special Wheat Puffs 0.105664 3.130881e-03 4.999649e-02 True
821999 Gauss Monthly Fashion Magazine PigTail Beef TV Dinner 0.105664 3.130969e-03 4.999724e-02 True
484896 Colony Bagels Lake Roasted Chicken 0.105664 3.131013e-03 4.999728e-02 True

76054 rows × 6 columns

Корреляционный метод


In [ ]:
%%time 
corr_data = []

for i, lhs_column in enumerate(sparse_sales.columns):
    for j, rhs_column in enumerate(sparse_sales.columns):
        if i >= j:
            continue
        
        corr, p = pearsonr(sparse_sales[lhs_column], sparse_sales[rhs_column])
        prod = (sparse_sales[lhs_column]-np.mean(sparse_sales[lhs_column])) * (sparse_sales[rhs_column]-np.mean(sparse_sales[rhs_column]))
        theta = np.mean((prod - np.cov(sparse_sales)[1,2])**2)
        T = sum(prod) / np.sqrt(n * theta)
        corr_data.append([lhs_column, rhs_column, corr, p, T])

In [48]:
sales_correlation = pd.DataFrame.from_records(corr_data)
sales_correlation.columns = ['product_A', 'product_B', 'corr', 'p', 'T']

In [ ]:
alpha = 1
k = df.shape[1]
a = 2 * np.log(np.log(k))
b = np.sqrt(4 * np.log(p) - a)